{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "chinese-standard.jsonl.dedup\r\n",
      "filter-malay-rojak-en.jsonl.dedup\r\n",
      "filter-malay-rojak-ms.jsonl.dedup\r\n",
      "filter-malay-rojak-rojak.jsonl.dedup\r\n",
      "filter-twitter-id.jsonl.dedup\r\n",
      "filter-twitter-malay-rojak-id.jsonl.dedup\r\n",
      "filter-twitter-malay-rojak-ms.jsonl.dedup\r\n",
      "filter-twitter-malay-rojak-rojak.jsonl.dedup\r\n",
      "kaskus.jsonl.dedup\r\n",
      "local-mandarin.jsonl.dedup\r\n",
      "others.jsonl.dedup\r\n",
      "others-sample.jsonl.dedup\r\n",
      "prepare-english-en.jsonl.dedup\r\n",
      "prepare-english-ms.jsonl.dedup\r\n",
      "prepare-indon.jsonl.dedup\r\n",
      "prepare-indon-standard.jsonl.dedup\r\n",
      "prepare-malay-ms.jsonl.dedup\r\n",
      "prepare-manglish-en.jsonl.dedup\r\n",
      "prepare-manglish-manglish.jsonl.dedup\r\n",
      "prepare-standard-mandarin.jsonl.dedup\r\n",
      "standard-mandarin.jsonl.dedup\r\n"
     ]
    }
   ],
   "source": [
    "!ls *.dedup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import random\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "train = open('train-fasttext.txt', 'w')\n",
    "test = open('test-fasttext.txt', 'w')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "standard_english = [\n",
    "    'prepare-english-en.jsonl.dedup'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "999999it [00:01, 644937.66it/s]\n"
     ]
    }
   ],
   "source": [
    "for file in standard_english:\n",
    "    count = 0\n",
    "    with open(file) as fopen:\n",
    "        for l in tqdm(fopen):\n",
    "            data = json.loads(l).strip()\n",
    "            if not len(data):\n",
    "                continue\n",
    "\n",
    "            if random.random() >= 0.95:\n",
    "                f = test\n",
    "            else:\n",
    "                f = train\n",
    "\n",
    "            t = f'__label__standard-english {data}'\n",
    "            f.write(f'{t}\\n')\n",
    "            count += 1\n",
    "            if count >= 1e6:\n",
    "                break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "local_english = [\n",
    "    'prepare-manglish-en.jsonl.dedup',\n",
    "    'filter-malay-rojak-en.jsonl.dedup',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "999999it [00:01, 665364.98it/s]\n",
      "1314it [00:00, 764292.81it/s]\n"
     ]
    }
   ],
   "source": [
    "for file in local_english:\n",
    "    count = 0\n",
    "    with open(file) as fopen:\n",
    "        for l in tqdm(fopen):\n",
    "            data = json.loads(l).strip()\n",
    "            if not len(data):\n",
    "                continue\n",
    "\n",
    "            if random.random() >= 0.95:\n",
    "                f = test\n",
    "            else:\n",
    "                f = train\n",
    "\n",
    "            t = f'__label__local-english {data}'\n",
    "            f.write(f'{t}\\n')\n",
    "            count += 1\n",
    "            if count >= 1e6:\n",
    "                break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "manglish = [\n",
    "    'prepare-manglish-manglish.jsonl.dedup',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "999999it [00:01, 524529.28it/s]\n"
     ]
    }
   ],
   "source": [
    "for file in manglish:\n",
    "    count = 0\n",
    "    with open(file) as fopen:\n",
    "        for l in tqdm(fopen):\n",
    "            data = json.loads(l).strip()\n",
    "            if not len(data):\n",
    "                continue\n",
    "\n",
    "            if random.random() >= 0.95:\n",
    "                f = test\n",
    "            else:\n",
    "                f = train\n",
    "\n",
    "            t = f'__label__manglish {data}'\n",
    "            f.write(f'{t}\\n')\n",
    "            count += 1\n",
    "            if count >= 1e6:\n",
    "                break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "standard_indonesian = [\n",
    "    'prepare-indon-standard.jsonl.dedup'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2992230 prepare-indon-standard.jsonl.dedup\r\n"
     ]
    }
   ],
   "source": [
    "!wc -l prepare-indon-standard.jsonl.dedup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "999999it [00:05, 178287.59it/s]\n"
     ]
    }
   ],
   "source": [
    "for file in standard_indonesian:\n",
    "    count = 0\n",
    "    with open(file) as fopen:\n",
    "        for l in tqdm(fopen):\n",
    "            data = json.loads(l).strip()\n",
    "            if not len(data):\n",
    "                continue\n",
    "\n",
    "            if random.random() >= 0.95:\n",
    "                f = test\n",
    "            else:\n",
    "                f = train\n",
    "\n",
    "            t = f'__label__standard-indonesian {data}'\n",
    "            f.write(f'{t}\\n')\n",
    "            count += 1\n",
    "            if count >= 1e6:\n",
    "                break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2195647 prepare-indon.jsonl.dedup\r\n"
     ]
    }
   ],
   "source": [
    "!wc -l prepare-indon.jsonl.dedup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "socialmedia_indonesian = [\n",
    "    'prepare-indon.jsonl.dedup',\n",
    "    'filter-twitter-malay-rojak-id.jsonl.dedup',\n",
    "    'kaskus.jsonl.dedup',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "499999it [00:00, 501901.64it/s]\n",
      "499999it [00:00, 810585.26it/s]\n",
      "499999it [00:00, 758577.09it/s]\n"
     ]
    }
   ],
   "source": [
    "for file in socialmedia_indonesian:\n",
    "    count = 0\n",
    "    with open(file) as fopen:\n",
    "        for l in tqdm(fopen):\n",
    "            data = json.loads(l).strip()\n",
    "            if not len(data):\n",
    "                continue\n",
    "\n",
    "            if random.random() >= 0.95:\n",
    "                f = test\n",
    "            else:\n",
    "                f = train\n",
    "\n",
    "            t = f'__label__socialmedia-indonesian {data}'\n",
    "            f.write(f'{t}\\n')\n",
    "            count += 1\n",
    "            if count >= 5e5:\n",
    "                break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "standard_malay = [\n",
    "    'prepare-english-ms.jsonl.dedup'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "999999it [00:01, 649749.77it/s]\n"
     ]
    }
   ],
   "source": [
    "for file in standard_malay:\n",
    "    count = 0\n",
    "    with open(file) as fopen:\n",
    "        for l in tqdm(fopen):\n",
    "            data = json.loads(l).strip()\n",
    "            if not len(data):\n",
    "                continue\n",
    "\n",
    "            if random.random() >= 0.95:\n",
    "                f = test\n",
    "            else:\n",
    "                f = train\n",
    "\n",
    "            t = f'__label__standard-malay {data}'\n",
    "            f.write(f'{t}\\n')\n",
    "            count += 1\n",
    "            if count >= 1e6:\n",
    "                break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "local_malay = [\n",
    "    'filter-malay-rojak-ms.jsonl.dedup',\n",
    "    'filter-twitter-malay-rojak-ms.jsonl.dedup',\n",
    "    'filter-twitter-malay-rojak-ms.jsonl.dedup',\n",
    "    'filter-malay-rojak-rojak.jsonl.dedup'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "299999it [00:00, 760412.35it/s]\n",
      "299999it [00:00, 831285.55it/s]\n",
      "299999it [00:00, 826388.35it/s]\n",
      "299999it [00:00, 542768.66it/s]\n"
     ]
    }
   ],
   "source": [
    "for file in local_malay:\n",
    "    count = 0\n",
    "    with open(file) as fopen:\n",
    "        for l in tqdm(fopen):\n",
    "            data = json.loads(l).strip()\n",
    "            if not len(data):\n",
    "                continue\n",
    "\n",
    "            if random.random() >= 0.95:\n",
    "                f = test\n",
    "            else:\n",
    "                f = train\n",
    "\n",
    "            t = f'__label__local-malay {data}'\n",
    "            f.write(f'{t}\\n')\n",
    "            count += 1\n",
    "            if count >= 3e5:\n",
    "                break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "standard_mandarin = [\n",
    "    'chinese-standard.jsonl.dedup', \n",
    "    'standard-mandarin.jsonl.dedup', \n",
    "    'prepare-standard-mandarin.jsonl.dedup'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "def reject(ori):\n",
    "    s = ori.lower()\n",
    "    s = re.sub('[^A-Za-z ]+', ' ', s)\n",
    "    s = re.sub(r'[ ]+', ' ', s).strip()\n",
    "    return (len(s) / len(ori)) > 0.05"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "79664it [00:00, 352637.20it/s]\n",
      "561451it [00:01, 299466.85it/s]\n",
      "587233it [00:15, 37368.40it/s]\n"
     ]
    }
   ],
   "source": [
    "for file in standard_mandarin:\n",
    "    count = 0\n",
    "    with open(file) as fopen:\n",
    "        for l in tqdm(fopen):\n",
    "            data = json.loads(l).strip()\n",
    "            if not len(data):\n",
    "                continue\n",
    "                \n",
    "            if reject(data):\n",
    "                continue\n",
    "\n",
    "            if random.random() >= 0.95:\n",
    "                f = test\n",
    "            else:\n",
    "                f = train\n",
    "\n",
    "            t = f'__label__standard-mandarin {data}'\n",
    "            f.write(f'{t}\\n')\n",
    "            count += 1\n",
    "            if count >= 5e5:\n",
    "                break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "reject(\"\\u5356\\u7684\\u4eba \\u8d5a\\u94b1\\u5f88\\u6709\\u6548\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "local_mandarin = [\n",
    "    'local-mandarin.jsonl.dedup'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\"\\u5356\\u7684\\u4eba \\u8d5a\\u94b1\\u5f88\\u6709\\u6548\"\r\n",
      "\"\\u5514\\u7559\\u624b \\u53d1\\u8868\\u4e8e am \\u5982\\u53ea\\u8ad6lcci\\u61c9\\u8a72\\u662feu\\u5427\\u2026\\u2026\\u3002 \\u6211\\u8bfb\\u5b8clcci \\u8981\\u4feeacca\\u7684\"\r\n"
     ]
    }
   ],
   "source": [
    "!head -n 2 local-mandarin.jsonl.dedup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "1521190it [00:09, 164639.62it/s]\n"
     ]
    }
   ],
   "source": [
    "for file in local_mandarin:\n",
    "    count = 0\n",
    "    with open(file) as fopen:\n",
    "        for l in tqdm(fopen):\n",
    "            data = json.loads(l).strip()\n",
    "            if not len(data):\n",
    "                continue\n",
    "                \n",
    "            if reject(data):\n",
    "                continue\n",
    "\n",
    "            if random.random() >= 0.95:\n",
    "                f = test\n",
    "            else:\n",
    "                f = train\n",
    "\n",
    "            t = f'__label__local-mandarin {data}'\n",
    "            f.write(f'{t}\\n')\n",
    "            count += 1\n",
    "            if count >= 1e6:\n",
    "                break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "__label__local-mandarin 我支持马华。。。的对手\r\n",
      "__label__local-mandarin 站桩 很多种，太极内功桩法就十来种。 姿势是否正确很关键。\r\n",
      "__label__local-mandarin 尊孔国中儿女 把根留住\r\n"
     ]
    }
   ],
   "source": [
    "!tail -n 3 train-fasttext.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "count = 0\n",
    "with open('others-sample.jsonl.dedup') as fopen:\n",
    "    for l in fopen:\n",
    "        data = json.loads(l).strip()\n",
    "        if not len(data):\n",
    "            continue\n",
    "            \n",
    "        if random.random() >= 0.95:\n",
    "            f = test\n",
    "        else:\n",
    "            f = train\n",
    "        \n",
    "        t = f'__label__other {data}'\n",
    "        f.write(f'{t}\\n')\n",
    "        count += 1\n",
    "        if count >= 1e7:\n",
    "            break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "test.close()\n",
    "train.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "!shuf train-fasttext.txt > shuf-train-fasttext.txt\n",
    "!shuf test-fasttext.txt > shuf-test-fasttext.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "__label__standard-english the bn needs to identify local issues that can be overcome rather than bring national sentiment which is seen as less capable of attracting voters sentiment there he said\r\n",
      "__label__standard-english she has appeared in the sitcom rock as the son of jenna maroney and has also appeared in the independent film fort tilden\r\n"
     ]
    }
   ],
   "source": [
    "!head -n 2 train-fasttext.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "\n",
    "train_label = defaultdict(int)\n",
    "test_label = defaultdict(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "10500345it [00:19, 527689.13it/s] \n"
     ]
    }
   ],
   "source": [
    "with open('train-fasttext.txt') as fopen:\n",
    "    for l in tqdm(fopen):\n",
    "        train_label[l.split()[0]] += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "defaultdict(int,\n",
       "            {'__label__standard-english': 950224,\n",
       "             '__label__local-english': 950885,\n",
       "             '__label__manglish': 950352,\n",
       "             '__label__standard-indonesian': 949852,\n",
       "             '__label__socialmedia-indonesian': 1424860,\n",
       "             '__label__standard-malay': 949951,\n",
       "             '__label__local-malay': 1140123,\n",
       "             '__label__standard-mandarin': 1024328,\n",
       "             '__label__local-mandarin': 950180,\n",
       "             '__label__other': 1209590})"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "552946it [00:01, 530178.66it/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "defaultdict(int,\n",
       "            {'__label__standard-english': 49776,\n",
       "             '__label__local-english': 50429,\n",
       "             '__label__manglish': 49648,\n",
       "             '__label__standard-indonesian': 50148,\n",
       "             '__label__socialmedia-indonesian': 75140,\n",
       "             '__label__standard-malay': 50049,\n",
       "             '__label__local-malay': 59877,\n",
       "             '__label__standard-mandarin': 53709,\n",
       "             '__label__local-mandarin': 49820,\n",
       "             '__label__other': 64350})"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "with open('test-fasttext.txt') as fopen:\n",
    "    for l in tqdm(fopen):\n",
    "        test_label[l.split()[0]] += 1\n",
    "        \n",
    "test_label"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
